{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np \n",
    "import seaborn as sns\n",
    "import matplotlib \n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import skew\n",
    "from scipy.stats.stats import pearsonr\n",
    "\n",
    "# Load the training and test sets\n",
    "train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))\n",
    "\n",
    "# Keep the raw and the log1p-transformed SalePrice side by side for inspection,\n",
    "# then replace the SalePrice column with its log1p transform\n",
    "prices = pd.DataFrame({\n",
    "    'price': train['SalePrice'],\n",
    "    'log(price+1)': np.log1p(train['SalePrice']),\n",
    "})\n",
    "# prices.hist()\n",
    "train['SalePrice'] = prices['log(price+1)']\n",
    "\n",
    "# GrLivArea: drop the training rows whose living area is 4500 or more.\n",
    "# Assigning a filtered Series back to the column would only turn the outlier\n",
    "# values into NaN and keep those rows (and their SalePrice) in `train`.\n",
    "train = train[train['GrLivArea'] < 4500].reset_index(drop=True)\n",
    "\n",
    "# Stack the train and test feature columns (MSSubClass..SaleCondition) into one frame\n",
    "all_data = pd.concat([frame.loc[:, 'MSSubClass':'SaleCondition']\n",
    "                      for frame in (train, test)])\n",
    "\n",
    "# Log-transform the numeric features whose skewness on the training rows exceeds 0.5\n",
    "numeric_feats = all_data.columns[(all_data.dtypes != 'object').values]\n",
    "skewed_feats = (train[numeric_feats]\n",
    "                .apply(lambda col: skew(col.dropna()))\n",
    "                .loc[lambda s: s > 0.5]\n",
    "                .index)\n",
    "all_data[skewed_feats] = np.log1p(all_data[skewed_feats])\n",
    "\n",
    "# Convert to dummy variables, then fill the remaining gaps with the column means\n",
    "all_data = pd.get_dummies(all_data).pipe(lambda df: df.fillna(df.mean()))\n",
    "\n",
    "# The first len(train) rows are the training rows, the rest are the test rows\n",
    "X_train = all_data.iloc[:len(train)]\n",
    "X_test = all_data.iloc[len(train):]\n",
    "y = train['SalePrice']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(2917, 79)\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd \n",
    "import numpy as np \n",
    "import seaborn as sns\n",
    "import matplotlib \n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import skew\n",
    "from scipy.stats.stats import pearsonr\n",
    "from scipy.special import boxcox1p\n",
    "from scipy.stats import boxcox_normmax\n",
    "\n",
    "\n",
    "# Load the raw training and test sets\n",
    "train = pd.read_csv('train.csv')\n",
    "test = pd.read_csv('test.csv')\n",
    "\n",
    "# Set the Id columns aside; pop() also removes them from the frames\n",
    "train_ID = train.pop('Id')\n",
    "test_ID = test.pop('Id')\n",
    "\n",
    "# Keep only the training rows with GrLivArea below 4500 and renumber them\n",
    "train = train.loc[train['GrLivArea'] < 4500].reset_index(drop=True)\n",
    "\n",
    "# The target is log1p(SalePrice); it is split off from the training features\n",
    "train['SalePrice'] = np.log1p(train['SalePrice'])\n",
    "y = train['SalePrice'].reset_index(drop=True)\n",
    "train_features = train.drop(columns=['SalePrice'])\n",
    "test_features = test\n",
    "\n",
    "# Stack train and test features under a fresh 0..n-1 index\n",
    "features = pd.concat([train_features, test_features], ignore_index=True)\n",
    "print(features.shape)\n",
    "# Cast these columns to str so they count as object (non-numeric) columns below\n",
    "features['MSSubClass'] = features['MSSubClass'].apply(str)\n",
    "features['YrSold'] = features['YrSold'].astype(str)\n",
    "features['MoSold'] = features['MoSold'].astype(str)\n",
    "\n",
    "# Fill selected columns with a fixed label or with the column's most frequent value\n",
    "features['Functional'] = features['Functional'].fillna('Typ')\n",
    "features['Electrical'] = features['Electrical'].fillna('SBrkr')\n",
    "features['KitchenQual'] = features['KitchenQual'].fillna('TA')\n",
    "features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])\n",
    "features['Exterior2nd'] = features['Exterior2nd'].fillna(features['Exterior2nd'].mode()[0])\n",
    "features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])\n",
    "features['PoolQC'] = features['PoolQC'].fillna('None')\n",
    "\n",
    "# Missing garage measurements become 0\n",
    "for feat in ['GarageYrBlt', 'GarageArea', 'GarageCars']:\n",
    "    features[feat] = features[feat].fillna(0)\n",
    "# fillna() returns a new Series, so the result has to be assigned back;\n",
    "# calling it without the assignment leaves the column unchanged\n",
    "for feat in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:\n",
    "    features[feat] = features[feat].fillna('None')\n",
    "for feat in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:\n",
    "    features[feat] = features[feat].fillna('None')\n",
    "# MSZoning: most frequent value within the same MSSubClass group\n",
    "features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))\n",
    "\n",
    "# Every object column that still has gaps gets the label 'None'\n",
    "objects = features.dtypes[features.dtypes=='object'].index\n",
    "features.update(features[objects].fillna('None'))\n",
    "# LotFrontage: median within the same Neighborhood group\n",
    "features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))\n",
    "\n",
    "# Box-Cox transform the numeric columns whose skewness exceeds 0.5.\n",
    "# skew() returns NaN for a column that contains missing values, and NaN > 0.5\n",
    "# is False, so such columns would be skipped silently. Missing values are\n",
    "# therefore dropped before measuring skewness and before fitting lambda;\n",
    "# boxcox1p itself leaves NaN entries as NaN.\n",
    "numerics = features.dtypes[features.dtypes!='object'].index\n",
    "skew_features = features[numerics].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)\n",
    "high_skew = skew_features[skew_features > 0.5]\n",
    "skew_index = high_skew.index\n",
    "for i in skew_index:\n",
    "    features[i] = boxcox1p(features[i], boxcox_normmax(features[i].dropna() + 1))\n",
    "\n",
    "\n",
    "    \n",
    "# Combined totals; the additions run in the same left-to-right order as a `+` chain\n",
    "features['YrBltAndRemod'] = features['YearBuilt'].add(features['YearRemodAdd'])\n",
    "features['TotalSF'] = (features['TotalBsmtSF']\n",
    "                       .add(features['1stFlrSF'])\n",
    "                       .add(features['2ndFlrSF']))\n",
    "\n",
    "features['Total_sqr_footage'] = (features['BsmtFinSF1']\n",
    "                                 .add(features['BsmtFinSF2'])\n",
    "                                 .add(features['1stFlrSF'])\n",
    "                                 .add(features['2ndFlrSF']))\n",
    "\n",
    "# Half baths count as 0.5\n",
    "features['Total_Bathrooms'] = (features['FullBath']\n",
    "                               .add(features['HalfBath'].mul(0.5))\n",
    "                               .add(features['BsmtFullBath'])\n",
    "                               .add(features['BsmtHalfBath'].mul(0.5)))\n",
    "\n",
    "features['Total_porch_sf'] = (features['OpenPorchSF']\n",
    "                              .add(features['3SsnPorch'])\n",
    "                              .add(features['EnclosedPorch'])\n",
    "                              .add(features['ScreenPorch'])\n",
    "                              .add(features['WoodDeckSF']))\n",
    "\n",
    "# simplified features: 1 when the source column is greater than 0, otherwise 0\n",
    "for flag, source in [('haspool', 'PoolArea'),\n",
    "                     ('has2ndfloor', '2ndFlrSF'),\n",
    "                     ('hasgarage', 'GarageArea'),\n",
    "                     ('hasbsmt', 'TotalBsmtSF'),\n",
    "                     ('hasfireplace', 'Fireplaces')]:\n",
    "    features[flag] = (features[source] > 0).astype('int64')\n",
    "\n",
    "    \n",
    "\n",
    "# Fill whatever is still missing with 0. The null check runs on `features`:\n",
    "# `X` is only created a few lines below, so reading it here fails on a fresh\n",
    "# kernel (or silently uses a stale frame left over from an earlier run).\n",
    "other_feats = features.loc[:, features.isnull().any()]\n",
    "for feat in other_feats:\n",
    "    features[feat] = features[feat].fillna(0)\n",
    "    \n",
    "# One-hot encode, then split back into the training rows and the test rows\n",
    "X = pd.get_dummies(features)\n",
    "X_train = X[:len(y)]\n",
    "X_test = X[len(y):]\n",
    "\n",
    "# Collect the columns in which a single value fills more than 99.94% of the rows.\n",
    "# Each column is looked up through the loop variable `feat`.\n",
    "overfit = []\n",
    "for feat in X.columns:\n",
    "    counts = X[feat].value_counts()\n",
    "    zeros = counts.iloc[0]  # count of the most frequent value in this column\n",
    "    if zeros / len(X) * 100 > 99.94:\n",
    "        overfit.append(feat)\n",
    "\n",
    "X_train = X_train.drop(overfit, axis=1).copy()\n",
    "X_test = X_test.drop(overfit, axis=1).copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ridge:  0.11168469485302249 0.006239395881046382\n",
      "Lasso:  0.10852487811653169 0.006505856972357174\n",
      "ela:  0.10852686521478012 0.006469188908459015\n",
      "svr:  0.13588143757711868 0.009461465491795013\n",
      "[22:48:36] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Voyager\\Anaconda3\\lib\\site-packages\\xgboost\\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n",
      "C:\\Users\\Voyager\\Anaconda3\\lib\\site-packages\\xgboost\\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[22:49:10] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Voyager\\Anaconda3\\lib\\site-packages\\xgboost\\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[22:49:45] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Voyager\\Anaconda3\\lib\\site-packages\\xgboost\\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[22:50:21] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Voyager\\Anaconda3\\lib\\site-packages\\xgboost\\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[22:50:57] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n",
      "xgboost:  0.11205245075036237 0.006208896979159475\n",
      "gbr:  0.1151789248419383 0.007841011860906279\n",
      "lightgbm:  0.11575148572286598 0.005497098894733727\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Voyager\\Anaconda3\\lib\\site-packages\\xgboost\\core.py:587: FutureWarning: Series.base is deprecated and will be removed in a future version\n",
      "  if getattr(data, 'base', None) is not None and \\\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[22:53:21] WARNING: C:/Jenkins/workspace/xgboost-win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import Ridge, RidgeCV, ElasticNetCV, LassoCV, LassoLarsCV\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "\n",
    "from sklearn.model_selection import cross_val_score, train_test_split, KFold\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.pipeline import make_pipeline \n",
    "\n",
    "from xgboost import XGBRegressor\n",
    "from lightgbm import LGBMRegressor\n",
    "\n",
    "\n",
    "kfolds = KFold(n_splits=10, shuffle=True, random_state=42)\n",
    "\n",
    "\n",
    "def rmse_cv(model, cv=5):\n",
    "    \"\"\"Return the cross-validated RMSE scores of `model` on (X_train, y).\n",
    "\n",
    "    model: estimator passed to `cross_val_score`.\n",
    "    cv: cross-validation setting forwarded to `cross_val_score`. The default\n",
    "        of 5 matches the value that used to be hard-coded; pass `kfolds` to\n",
    "        score with the shuffled 10-fold splitter defined above.\n",
    "    Returns the square root of the negated `neg_mean_squared_error` scores.\n",
    "    \"\"\"\n",
    "    rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring='neg_mean_squared_error', cv=cv))\n",
    "    return rmse\n",
    "\n",
    "\n",
    "# Ridge\n",
    "alpha_ridge = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]\n",
    "ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alpha_ridge, cv=kfolds))\n",
    "\n",
    "# Lasso\n",
    "# max_iter is written as an int: scikit-learn releases that validate estimator\n",
    "# parameters reject the float 1e6\n",
    "alpha_lasso = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]\n",
    "lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1000000,\n",
    "                                               alphas=alpha_lasso,\n",
    "                                               random_state=42,\n",
    "                                               cv=kfolds))\n",
    "# Elasticnet (max_iter as an int, for the same reason as Lasso)\n",
    "alpha_ela = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]\n",
    "e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]\n",
    "ela = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1000000,\n",
    "                                                   alphas=alpha_ela,\n",
    "                                                   cv=kfolds,\n",
    "                                                   random_state=42,\n",
    "                                                   l1_ratio=e_l1ratio\n",
    "                                                   ))\n",
    "# SVR\n",
    "svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))\n",
    "\n",
    "# xgboost\n",
    "# 'reg:squarederror' replaces 'reg:linear', which XGBoost reports as deprecated\n",
    "xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460,\n",
    "                      max_depth=3, min_child_weight=0,\n",
    "                      gamma = 0, subsample=0.7,\n",
    "                      colsample_bytree=0.7,\n",
    "                      objective='reg:squarederror', nthread=-1,\n",
    "                      scale_pos_weight=1, seed=27,\n",
    "                      reg_alpha=0.00006, random_state=42)\n",
    "\n",
    "# gbr\n",
    "gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,\n",
    "                          max_depth=4, max_features='sqrt',\n",
    "                          min_samples_leaf=15, min_samples_split=10,\n",
    "                          loss='huber', random_state=42)\n",
    "\n",
    "# lightgbm\n",
    "lightgbm = LGBMRegressor(objective='regression',\n",
    "                         num_leaves=4,\n",
    "                         learning_rate=0.01,\n",
    "                         n_estimators=5000,\n",
    "                         max_bin=200,\n",
    "                         bagging_fraction=0.75,\n",
    "                         bagging_freq=5,\n",
    "                         bagging_seed=7,\n",
    "                         feature_fraction=0.2,\n",
    "                         feature_fraction_seed=7,\n",
    "                         verbose=-1)\n",
    "\n",
    "\n",
    "# Print the mean and standard deviation of the cross-validated RMSE of each model\n",
    "for label, model in [('Ridge: ', ridge),\n",
    "                     ('Lasso: ', lasso),\n",
    "                     ('ela: ', ela),\n",
    "                     ('svr: ', svr),\n",
    "                     ('xgboost: ', xgboost),\n",
    "                     ('gbr: ', gbr),\n",
    "                     ('lightgbm: ', lightgbm)]:\n",
    "    score = rmse_cv(model)\n",
    "    print(label, score.mean(), score.std())\n",
    "\n",
    "\n",
    "# Fit the six blended models on the full training set\n",
    "# (fit() is called in place on each estimator; svr is not part of the blend)\n",
    "for estimator in (ridge, lasso, ela, xgboost, gbr, lightgbm):\n",
    "    estimator.fit(X_train, y)\n",
    "\n",
    "\n",
    "# Weighted blend of the test-set predictions, added up in the listed order\n",
    "blend = [(0.3, lasso), (0.1, ridge), (0.1, ela),\n",
    "         (0.3, xgboost), (0.1, gbr), (0.1, lightgbm)]\n",
    "pred = sum(weight * member.predict(X_test) for weight, member in blend)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the submission: test ids plus predictions mapped back from log1p with expm1.\n",
    "# The id column is named 'Id', the same spelling as the Id column read from the input files.\n",
    "res = pd.DataFrame({'Id': test_ID, 'SalePrice': np.expm1(pred)})\n",
    "res.to_csv('submission_2019_12_19.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
